import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
import config
from config import BasePaths
from CellLineWork.CellLineExpressionMatrix import CellLineExpressionMatrix
import scipy
from scipy.cluster.hierarchy import linkage, dendrogram
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
import math
import pandas as pd
import pickle
# Build the processed cell-line expression object via the project pipeline.
exp_obj = CellLineExpressionMatrix().execute()
def create_clustering(expression_mat, show=False, p=100):
    """Hierarchically cluster the columns (cells) of *expression_mat*.

    Parameters
    ----------
    expression_mat : pandas.DataFrame or numpy.ndarray
        Genes x cells matrix; the cells (columns) are the items clustered.
    show : bool
        When True, display a truncated dendrogram of the resulting tree.
    p : int
        Number of final merged clusters shown in the truncated dendrogram.

    Returns
    -------
    tuple
        (square correlation-distance matrix, scipy linkage matrix,
         dendrogram dict as returned by scipy.cluster.hierarchy.dendrogram)
    """
    # Condensed correlation distance between every pair of cells.
    corr_dist_condensed = scipy.spatial.distance.pdist(expression_mat.transpose(), metric='correlation')
    # Expand the condensed form instead of recomputing the full matrix with
    # cdist -- same values, half the work (diagonal is exactly 0 here).
    corr_dist_matrix = scipy.spatial.distance.squareform(corr_dist_condensed)
    print(corr_dist_matrix.shape, len(corr_dist_condensed),
          expression_mat.shape, expression_mat.shape[1]*(expression_mat.shape[1]-1)/2)
    # Create tree with average linkage.
    corr_tree = linkage(corr_dist_condensed, method='average')
    # no_plot=True: the original call rendered this dendrogram onto whatever
    # the current figure was, even when show=False -- only the dict is needed.
    corr_dn = dendrogram(corr_tree, no_plot=True)
    if show:
        fig, ax = plt.subplots()
        ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
        ax.set_xlabel('sample index')
        ax.set_ylabel('distance')
        dendrogram(
            corr_tree,
            truncate_mode='lastp',   # show only the last p merged clusters
            p=p,                     # show only the last p merged clusters
            show_leaf_counts=False,  # otherwise numbers in brackets are counts
            leaf_rotation=90.,
            leaf_font_size=12.,
            show_contracted=True,    # to get a distribution impression in truncated branches
            ax=ax
        )
        plt.show()
    return corr_dist_matrix, corr_tree, corr_dn
def show_correlation_heatmap(expression_matrix, dendogram, corr_matrix=None, vmax=None, vmin=None):
    """Draw a cell-by-cell correlation heatmap, ordered by dendrogram leaves.

    If *corr_matrix* is None it is computed from *expression_matrix* (columns
    reordered by the dendrogram leaves) and the color range is derived from
    the off-diagonal percentiles unless vmin/vmax are given.  Otherwise the
    supplied matrix is reordered by the leaves and drawn with a fixed [-1, 1]
    range (vmin/vmax arguments are ignored on that path).
    """
    plt.subplot()
    leaves = dendogram['leaves']
    if corr_matrix is not None:
        ordered = pd.DataFrame(corr_matrix).iloc[leaves, leaves]
        sns.heatmap(ordered, xticklabels=False, yticklabels=False,
                    cmap='seismic', vmin=-1, vmax=1)
    else:
        corr_matrix = np.corrcoef(expression_matrix.iloc[:, leaves].transpose())
        mask = corr_matrix == 1  # (unused downstream; kept from the original)
        off_diagonal = corr_matrix[corr_matrix != 1]
        # NOTE(review): 0.05 is the 0.05th percentile, not the 5th -- this may
        # have been intended as 5; behavior preserved as-is.
        bound = max(np.absolute(np.percentile(off_diagonal, 95)),
                    np.absolute(np.percentile(off_diagonal, 0.05)))
        if vmin is None:
            vmin = -bound
        if vmax is None:
            vmax = bound
        print('vmax={}, vmin={}'.format(vmax, vmin))
        sns.heatmap(corr_matrix, xticklabels=False, yticklabels=False,
                    cmap='seismic', vmax=vmax, vmin=vmin)
        print(corr_matrix.shape)
    plt.show()
# Cluster all cells together and visualize the full tree and heatmaps.
full_dist, full_tree, full_dn = create_clustering(exp_obj.expression_matrix, show=True)
show_correlation_heatmap(exp_obj.expression_matrix, full_dn)
# Same heatmap with a tighter, symmetric color range.
show_correlation_heatmap(exp_obj.expression_matrix, full_dn, vmin=-0.3, vmax=0.3)
plt.figure(figsize=(10,10))
ax = plt.subplot()
dendrogram(
    full_tree,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=50,  # show only the last p merged clusters
    show_leaf_counts=False,  # otherwise numbers in brackets are counts
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches,
    ax=ax
)
# Zoom in on the top-most merges of the tree.
ax.set_ylim(bottom=0.74, top=0.84)
plt.show()
# Cell-line annotation CSV and the columns used to cross-check each cell's
# identity between assays.
cell_lines_path = BasePaths.CellLineExpression
columns_to_compare=['GE_CCLE_match', 'SNP_CL_match']
cell_id_column="sample_id"
def get_cohesive_cell_lines(csv_path, columns_to_compare, id_column):
    """Return rows of the cell-line table whose identity columns all agree.

    Reads the CSV at *csv_path*, keeps only rows where every column in
    *columns_to_compare* holds the same value, and returns a two-column
    frame: *id_column* plus a 'CellLine' column taken from the first
    compared column.
    """
    table = pd.read_csv(csv_path)
    agrees = table.apply(
        lambda row: len(np.unique(row[columns_to_compare])) == 1, axis=1)
    cohesive = table.loc[agrees]
    selected = cohesive[[id_column, columns_to_compare[0]]]
    return selected.rename(columns={columns_to_compare[0]: "CellLine"})
cohesive_cell_lines = get_cohesive_cell_lines(cell_lines_path, columns_to_compare, cell_id_column)
cell_lines = set(cohesive_cell_lines['CellLine'].values)
# Notebook-style inspection: per-cell-line sample counts (the result is
# discarded when run as a plain script).
cohesive_cell_lines.groupby('CellLine').count().sort_values(by='sample_id', ascending=False)
def get_cohesive_expression_obj(exp_obj, cohesive_cell_lines):
    """Return the sub-matrix of exp_obj.expression_matrix containing only the
    cells whose barcode appears in cohesive_cell_lines.sample_id."""
    wanted = cohesive_cell_lines.sample_id.values
    keep_flags = [barcode in wanted for barcode in exp_obj.cells_matrix]
    keep_positions = [pos for pos, keep in enumerate(keep_flags) if keep]
    # NOTE(review): .loc with integer positions relies on the columns being a
    # 0..n-1 RangeIndex; .iloc may have been intended -- confirm upstream.
    sub_matrix = exp_obj.expression_matrix.loc[:, keep_positions]
    print(sub_matrix.shape, exp_obj.expression_matrix.shape)
    return sub_matrix
def change_cells_index_to_barcodes(exp_mat, cell_name):
    """Transpose *exp_mat* so cells become rows, and re-index those rows with
    their barcodes looked up by position in *cell_name*."""
    transposed = exp_mat.transpose()
    transposed['barcode'] = [cell_name[pos] for pos in transposed.index]
    return transposed.set_index('barcode')
# Restrict the expression matrix to cohesive cells and key it by barcode.
cohesive_exp_matrix = get_cohesive_expression_obj(exp_obj, cohesive_cell_lines)
cohesive_exp_matrix = change_cells_index_to_barcodes(cohesive_exp_matrix, exp_obj.cells_matrix)
# Attach each cell's line label (barcode -> CellLine lookup).
cohesive_exp_matrix['CellLine'] = cohesive_cell_lines.set_index('sample_id').loc[cohesive_exp_matrix.index]
cohesive_exp_matrix.head()
# Keep only cell lines with at least 50 cells; store genes x cells matrices.
cell_lines_dict = {}
for cell_line in set(cohesive_exp_matrix.CellLine.values):
    exp_mat = cohesive_exp_matrix.loc[
        cohesive_exp_matrix.CellLine==cell_line,
        [col for col in cohesive_exp_matrix.columns if col!='CellLine']].transpose()
    if exp_mat.shape[1]>=50:
        cell_lines_dict[cell_line] = exp_mat
# Cluster and visualize each retained cell line separately.
for cell_line, exp_matrix in cell_lines_dict.items():
    print('working on {}'.format(cell_line))
    corr_dist, tree, dn = create_clustering(exp_matrix, show=True)
    show_correlation_heatmap(exp_matrix, dn, vmin=0.0)
# Minimal stand-in with a .name attribute, as expected by the pipeline
# steps' out_file_path().
class HasName: pass
in_path = HasName()
in_path.name = ''
# Walk the first two pipeline steps to recover the intermediate file path.
for pipe_step in exp_obj.composing_items[:2]:
    in_path.name = pipe_step.out_file_path(in_path)
print(in_path.name)
# NOTE(review): unpickling a pipeline-produced file; safe only because the
# data is local and trusted.
with open(in_path.name, 'rb') as in_file:
    original_exp_obj = pickle.load(in_file)
original_exp_obj
# Re-key the pre-processed matrix by barcode, then transpose back to
# genes x cells.
original_exp_obj.expression_matrix = change_cells_index_to_barcodes(
    original_exp_obj.expression_matrix,
    original_exp_obj.cells_matrix).transpose()
original_exp_obj.expression_matrix.head()
# Filter genes by mean expression in the original (pre-processed) data:
# per cell line, keep genes with log2(mean + 1) > 4, plotting the sorted
# values and a histogram as a diagnostic.
tmp_dict = {}
for cell_line, exp_matrix in cell_lines_dict.items():
    print("working on cell line {}".format(cell_line))
    tmp_original_exp = original_exp_obj.expression_matrix.loc[exp_matrix.index, exp_matrix.columns]
    gene_sum_values = tmp_original_exp.values
    gene_sum_values = gene_sum_values.mean(axis=1)
    gene_sum_values += 1
    gene_sum_values = np.log2(gene_sum_values)
    fig, ax = plt.subplots(2, 1)
    ax[0].plot(range(len(gene_sum_values)), np.sort(gene_sum_values))
    ax[1].hist(gene_sum_values, bins=30)
    plt.show()
    # Boolean mask with iloc keeps the rows (genes) above the threshold.
    tmp_dict[cell_line] = exp_matrix.iloc[gene_sum_values>4, :]
cell_lines_dict = tmp_dict
# Inspect one cell line in detail.
cell_line_name = 'ONCODG1_OVARY'
single_cell_line_exp = cell_lines_dict[cell_line_name]
single_cell_line_exp.head()
# Change from log2((TPM/10)+1) to TPM
tpm_data = single_cell_line_exp.copy()
tpm_data = tpm_data.apply(lambda x: 10*((2**x)-1))
tpm_data.head()
exp_obj.composing_items
# Same mean-expression diagnostic, for the selected cell line only.
tmp_original_exp = original_exp_obj.expression_matrix.loc[single_cell_line_exp.index, single_cell_line_exp.columns]
gene_sum_values = tmp_original_exp.values
gene_sum_values = gene_sum_values.mean(axis=1)
gene_sum_values += 1
gene_sum_values = np.log2(gene_sum_values)
fig, ax = plt.subplots(2, 1)
ax[0].plot(range(len(gene_sum_values)), np.sort(gene_sum_values))
ax[1].hist(gene_sum_values, bins=30)
plt.show()
According to these results we will add a second threshold of 3.
# Center each gene (row) by subtracting its mean across cells.
centered_cell_lines_dict = {}
for cell_line, cell_line_exp in cell_lines_dict.items():
    centered_cell_lines_dict[cell_line] = {}
    centered_cell_lines_dict[cell_line]['exp_mat'] = cell_line_exp.sub(cell_line_exp.mean(axis=1), axis=0)
# Cluster each centered cell line and stash distances, tree and dendrogram.
for cell_line, exp_matrix in centered_cell_lines_dict.items():
    print('working on {}, of shape {}'.format(cell_line, exp_matrix['exp_mat'].shape))
    corr_dist, tree, dn = create_clustering(exp_matrix['exp_mat'], show=True)
    centered_cell_lines_dict[cell_line]['corr_dist'] = corr_dist
    centered_cell_lines_dict[cell_line]['tree'] = tree
    centered_cell_lines_dict[cell_line]['dn'] = dn
    show_correlation_heatmap(exp_matrix['exp_mat'], dn)
For each gene we compute the t-test between cluster and non-cluster cells. Meaningful clusters are those with a p-value lower than $10^{-4}$ or $10^{-5}$.
def calculate_jaccard_sim_binary(a, b):
    """Jaccard similarity |a AND b| / |a OR b| of two binary vectors.

    Returns 0.0 when both vectors are all-False (empty union), instead of
    raising ZeroDivisionError as the unguarded division would.
    """
    union = np.logical_or(a, b).sum()
    if union == 0:
        return 0.0
    return np.logical_and(a, b).sum() / float(union)
class Tree:
    """Hierarchical-clustering helper over the cells of one cell line.

    Wraps a scipy linkage matrix, enumerates the cell membership of every
    merge node, filters clusters to a size window, scores clusters by
    per-gene t-tests (cluster vs. the remaining cells), and prunes clusters
    whose cell membership overlaps almost completely (Jaccard similarity).
    """

    def __init__(self, linkage_mat, exp_matrix, dend, bottom_threshold=5, top_threshold_percentage=80.0):
        """
        linkage_mat -- scipy linkage matrix over the cells (columns).
        exp_matrix  -- genes x cells (centered) expression DataFrame.
        dend        -- dendrogram dict; only its 'leaves' ordering is used.
        bottom_threshold -- minimal cluster size kept.
        top_threshold_percentage -- maximal cluster size as % of all cells.
        """
        self.linkage_matrix = linkage_mat
        self.exp_matrix = exp_matrix
        self.dn = dend
        # A linkage over n items has n-1 merge rows.
        self.num_original_elements = self.linkage_matrix.shape[0]+1
        self.children_dict = {}
        self.get_children_dict()
        self.bottom_threshold = bottom_threshold
        self.top_threshold_percentage = top_threshold_percentage
        self.top_threshold = int(math.ceil(self.top_threshold_percentage*self.num_original_elements/100.0))
        print("Using boundaries of {} and {}".format(self.bottom_threshold, self.top_threshold))
        self.filter_clusters_by_size()

    def get_children_dict(self):
        """Map every node id (leaf or merge node) to its set of leaf indices."""
        for i in range(self.num_original_elements):
            self.children_dict[i] = set([i])
        for i in range(self.linkage_matrix.shape[0]):
            child_1, child_2, _, num_elements = self.linkage_matrix[i]
            try:
                self.children_dict[i+self.num_original_elements] = self.children_dict[
                    int(child_1)].union(self.children_dict[int(child_2)])
                # Sanity check against the linkage's own element count.
                if not len(self.children_dict[i+self.num_original_elements])==num_elements:
                    print("wrong number of elements")
                    raise IOError
            except Exception:  # was a bare except; narrowed, same log-and-continue behavior
                print("failed at cluster #{}".format(i))
                print(i+1+self.num_original_elements)
                print(child_1)

    def filter_clusters_by_size(self):
        """Keep only clusters within [bottom_threshold, top_threshold] cells."""
        self.filtered_clusters = {}
        for clust_id, cluster in self.children_dict.items():
            if len(cluster)>=self.bottom_threshold and len(cluster)<=self.top_threshold:
                self.filtered_clusters[clust_id] = cluster

    def create_differential_expression_matrix(self, p_value_threshold=10**-4, fold_change_threshold=2):
        """For every filtered cluster, t-test each gene against the other cells.

        Fills:
          differential_expression_per_cluster -- per-cluster gene tables,
          most_significant_gene_express       -- genes x clusters p-value frame,
          most_significant_cluster_per_gene   -- per gene, cluster with lowest p.
        """
        differential_expression_per_cluster = {}
        most_significant_gene_express = pd.DataFrame(
            index=self.exp_matrix.index)
        for clust_id, cluster in self.filtered_clusters.items():
            # Fix: use the matrix handed to the constructor instead of the
            # module-level centered_cell_lines_dict[cell_line] global (same
            # data in this script's flow, but a hidden coupling).
            in_clust = self.exp_matrix.iloc[:, list(cluster)]
            out_clust = self.exp_matrix.iloc[
                :, list(set(range(self.exp_matrix.shape[1])) - cluster)]
            diff_exp_genes = pd.DataFrame(data = in_clust.index, columns=['gene_number'])
            diff_exp_genes['ttest_res'] = diff_exp_genes['gene_number'].apply(
                lambda x: scipy.stats.ttest_ind(in_clust.loc[x], out_clust.loc[x]))
            diff_exp_genes['t_statistic'] = diff_exp_genes.ttest_res.apply(lambda x: x[0])
            diff_exp_genes['p_value'] = diff_exp_genes.ttest_res.apply(lambda x: x[1])
            most_significant_gene_express[clust_id] = diff_exp_genes['p_value']
            # "fold change" here is |mean difference| of already-log-space data.
            diff_exp_genes['fold_change'] = diff_exp_genes['gene_number'].apply(
                lambda x: np.absolute(np.mean(in_clust.loc[x]) - np.mean(out_clust.loc[x])))
            diff_exp_genes['is_significant'] = diff_exp_genes.apply(
                lambda x: x['p_value']<=p_value_threshold and x['fold_change']>=fold_change_threshold, axis=1)
            differential_expression_per_cluster[clust_id] = diff_exp_genes
        self.most_significant_gene_express = most_significant_gene_express
        self.differential_expression_per_cluster = differential_expression_per_cluster
        self.most_significant_cluster_per_gene = self.most_significant_gene_express.apply(
            lambda x: x.idxmin(skipna=True), axis=1).dropna()

    def get_significant_clusters(self, min_significant_genes=30, min_most_significant_genes=5, **kwargs):
        """Keep clusters with enough significant genes, or enough genes for
        which they are the single most significant cluster; sort by both."""
        if not hasattr(self, 'differential_expression_per_cluster'):
            self.create_differential_expression_matrix(**kwargs)
        significant_clusters = pd.DataFrame(index=['num_significant_genes', 'num_most_significant_genes'])
        for clust_id, cluster in self.filtered_clusters.items():
            try:
                num_significant_genes = self.differential_expression_per_cluster[clust_id]['is_significant'].sum()
            except Exception:  # was a bare except; log the context then re-raise
                print('tmp.differential_expression_per_cluster {}, clust_id {}, res'.format(
                    self.differential_expression_per_cluster, clust_id,
                    self.differential_expression_per_cluster[clust_id]['is_significant'].sum()))
                raise
            num_most_significant_genes = sum(self.most_significant_cluster_per_gene==clust_id)
            if num_significant_genes>=min_significant_genes or \
                    num_most_significant_genes>=min_most_significant_genes:
                significant_clusters[clust_id] = [num_significant_genes, num_most_significant_genes]
        print(significant_clusters.shape)
        significant_clusters = significant_clusters.transpose()
        significant_clusters = significant_clusters.reset_index().rename(
            columns={'index':'cluster_id'}).set_index('cluster_id')
        significant_clusters.sort_values(
            by=['num_significant_genes', 'num_most_significant_genes'],
            inplace=True, ascending=False)
        self.significant_clusters = significant_clusters

    def visualize_meaningful_clusters(self, **kwargs):
        """Plot the leaf-ordered correlation heatmap above a cells-vs-clusters
        membership map for the significant clusters."""
        if not hasattr(self, 'significant_clusters'):
            self.get_significant_clusters(**kwargs)
        cells_vs_clusters = pd.DataFrame(data = np.array(
            [False]*self.num_original_elements*self.significant_clusters.shape[0]).reshape(
            self.significant_clusters.shape[0], self.num_original_elements),
            index=self.significant_clusters.index,
            columns=range(self.num_original_elements))
        for cluster_id in self.significant_clusters.index:
            # NOTE(review): chained row assignment; works here because the
            # frame is single-dtype (bool) -- confirm if pandas is upgraded.
            cells_vs_clusters.loc[cluster_id].iloc[list(self.children_dict[cluster_id])] = True
        # Fix: order cells by this tree's own dendrogram rather than the
        # module-level centered_cell_lines_dict[cell_line]['dn'] global.
        cells_vs_clusters = cells_vs_clusters.iloc[:, self.dn['leaves']]
        cm = matplotlib.colors.LinearSegmentedColormap.from_list('BinaryColormap', ['black', 'yellow'], N=2)
        self.cm = cm
        fig, ax = plt.subplots(2, 1, figsize=(8, 16))
        corr_matrix = np.corrcoef(
            self.exp_matrix.iloc[
                :, self.dn['leaves']].transpose())
        mask = corr_matrix==1
        sns.heatmap(
            corr_matrix,
            xticklabels=False,
            yticklabels=False,
            cmap='seismic',
            vmax=0.3,
            vmin=-0.3,
            ax=ax[0])
        print("significant shape matrix {}".format(cells_vs_clusters.shape))
        sns.heatmap(cells_vs_clusters, cmap=cm, ax=ax[1], linewidths=0.01)
        ax[1].set_xlabel('Cells')
        ax[1].get_xaxis().set_ticks(range(cells_vs_clusters.shape[1]))
        ax[1].get_yaxis().set_ticks(range(cells_vs_clusters.shape[0]))
        ax[1].tick_params(labelbottom=False, labelleft=False)
        self.cells_vs_clusters = cells_vs_clusters

    def compute_intersection_between_clusters(self, maximal_interaction=0.85):
        """Greedily drop one of every pair of clusters whose cell-membership
        Jaccard similarity exceeds *maximal_interaction*, keeping the cluster
        with more significant genes; then re-plot the surviving clusters."""
        intersections_between_clusters = pd.DataFrame(
            index=self.cells_vs_clusters.index,
            columns=self.cells_vs_clusters.index)
        for row in intersections_between_clusters.index.values:
            for col in intersections_between_clusters.columns.values:
                intersections_between_clusters.loc[row, col] = calculate_jaccard_sim_binary(
                    self.cells_vs_clusters.loc[row], self.cells_vs_clusters.loc[col])
        # Zero out the diagonal (self-similarity is always 1).
        intersections_between_clusters -= np.identity(intersections_between_clusters.shape[0])
        clusters_to_keep = set(intersections_between_clusters.index)
        print(len(clusters_to_keep))
        while len(np.where(intersections_between_clusters>maximal_interaction)[0])>0:
            # Fix: the pair selection hard-coded 0.85 while the loop condition
            # uses the maximal_interaction parameter; use the parameter in both.
            clust_1 = intersections_between_clusters.index[
                np.where(intersections_between_clusters>maximal_interaction)[0][0]]
            clust_2 = intersections_between_clusters.index[
                np.where(intersections_between_clusters>maximal_interaction)[1][0]]
            print(clust_1, clust_2)
            if self.significant_clusters.loc[
                    clust_1].num_significant_genes > self.significant_clusters.loc[clust_2].num_significant_genes:
                intersections_between_clusters.drop(labels=[clust_2], axis=0, inplace=True)
                intersections_between_clusters.drop(labels=[clust_2], axis=1, inplace=True)
            elif self.significant_clusters.loc[
                    clust_1].num_significant_genes < self.significant_clusters.loc[clust_2].num_significant_genes:
                intersections_between_clusters.drop(labels=[clust_1], axis=0, inplace=True)
                intersections_between_clusters.drop(labels=[clust_1], axis=1, inplace=True)
            else:
                # Tie on significant genes: fall back to most-significant counts.
                if self.significant_clusters.loc[
                        clust_1].num_most_significant_genes > self.significant_clusters.loc[
                        clust_2].num_most_significant_genes:
                    intersections_between_clusters.drop(labels=[clust_2], axis=0, inplace=True)
                    intersections_between_clusters.drop(labels=[clust_2], axis=1, inplace=True)
                else:
                    print("clusters {} and {} are weirdly similar".format(clust_1, clust_2))
                    intersections_between_clusters.drop(labels=[clust_1], axis=0, inplace=True)
                    intersections_between_clusters.drop(labels=[clust_1], axis=1, inplace=True)
        self.intersections_between_clusters = intersections_between_clusters
        fig, ax = plt.subplots(2, 1, figsize=(8, 10))
        corr_matrix = np.corrcoef(
            self.exp_matrix.iloc[
                :, self.dn['leaves']].transpose())
        mask = corr_matrix==1
        sns.heatmap(
            corr_matrix,
            xticklabels=False,
            yticklabels=False,
            cmap='seismic',
            vmax=0.5,
            vmin=-0.5,
            ax=ax[0])
        print('intersection shape matrix {}'.format(self.cells_vs_clusters.loc[self.intersections_between_clusters.index].shape))
        sns.heatmap(
            self.cells_vs_clusters.loc[self.intersections_between_clusters.index],
            cmap=self.cm, ax=ax[1], linewidths=0.01)
        ax[1].set_xlabel('Cells')
        ax[1].get_xaxis().set_ticks(range(self.cells_vs_clusters.shape[1]))
        ax[1].get_yaxis().set_ticks(range(self.intersections_between_clusters.shape[0]))
        ax[1].tick_params(labelbottom=False, labelleft=False)
        plt.tight_layout()
        plt.show()
# Dry-run the Tree analysis on one cell line...
cell_line = 'KALS1_CENTRAL_NERVOUS_SYSTEM'
tmp = Tree(centered_cell_lines_dict[cell_line]['tree'],
           centered_cell_lines_dict[cell_line]['exp_mat'],
           centered_cell_lines_dict[cell_line]['dn'])
tmp.visualize_meaningful_clusters()
tmp.compute_intersection_between_clusters()
# ...then run it for every retained cell line.
trees_per_cell_line = {}
for cell_line, cell_line_dict in centered_cell_lines_dict.items():
    print("working on cell line {}".format(cell_line))
    ret = Tree(cell_line_dict['tree'],
               cell_line_dict['exp_mat'],
               cell_line_dict['dn'])
    ret.visualize_meaningful_clusters()
    ret.compute_intersection_between_clusters()
    trees_per_cell_line[cell_line] = ret
Sort clusters by score.
We are now moving our work from per-cell to per-program analysis (where a program is defined by a cluster). This requires the following steps:
# Inspect the differential-expression table of one cluster for one line.
cell_line = 'TE14_OESOPHAGUS'
trees_per_cell_line[cell_line].__dict__.keys()
# Tag the per-cluster table with its cluster id (adds a constant column).
trees_per_cell_line[cell_line].differential_expression_per_cluster[231]['cluster_id']=231
trees_per_cell_line[cell_line].differential_expression_per_cluster[231]